
library(haven)
library(readr)
library(data.table)
library(scales)


panel <- fread("H:/Zheng_10223/Joint/child_year_panel2025.csv", 
               select=c("MAIN_PARENT",  "IMDB_ID", "Year", "LANDING_AGE", "BIRTH_YEAR", 
                         "Child_TIRC_IND", "Child_TIRC_HH", 
                      
                        "MainParent_TIRC_IND", "MainParent_TIRC_HH","Child_IMM_STAT"
                       )
              )



# Drop children without parents:
panel <- panel[!(MAIN_PARENT=="")]

# Impute negative income to missing
panel[panel<0] <- NA

# Need to deflate to real income (2020$)
cpi <- read_dta("H:/Zheng_10223/Joint/cpi/cpi.dta")
panel <- merge(x=panel, y=cpi, by.x="Year", by.y="year", all.x=T)
panel[,c( "Child_TIRC_IND", "Child_TIRC_HH")] <- panel[,c( "Child_TIRC_IND", "Child_TIRC_HH")]*cpi$cpi[which(cpi$year==2020)]/panel$cpi
panel[,c( "MainParent_TIRC_IND", "MainParent_TIRC_HH")] <- panel[,c("MainParent_TIRC_IND", "MainParent_TIRC_HH")]*cpi$cpi[which(cpi$year==2020)]/panel$cpi


# Winsorize income at top 0.1% (among immigrant sample...)
panel$Child_TIRC_IND[which(panel$Child_TIRC_IND>=quantile(panel$Child_TIRC_IND, 0.999, na.rm=T))] <- quantile(panel$Child_TIRC_IND, 0.999, na.rm=T)
panel$Child_TIRC_HH[which(panel$Child_TIRC_HH>=quantile(panel$Child_TIRC_HH, 0.999, na.rm=T))] <- quantile(panel$Child_TIRC_HH, 0.999, na.rm=T)
panel$MainParent_TIRC_HH[which(panel$MainParent_TIRC_HH>=quantile(panel$MainParent_TIRC_HH, 0.999, na.rm=T))] <- quantile(panel$MainParent_TIRC_HH, 0.999, na.rm=T)

panel$MainParent_TIRC_IND[which(panel$MainParent_TIRC_IND>=quantile(panel$MainParent_TIRC_IND, 0.999, na.rm=T))] <- quantile(panel$MainParent_TIRC_IND, 0.999, na.rm=T)

# Get basic landing information for main parent
landing <- read_dta("G:/IMDB_AllYears/rdc/IMDB_BDIM_2022_v1/data_donnees/stata/Core_IMDB/pnrf_1980_2022_f3_v1.dta")
landing <- data.table(landing)

# merge in parent landing info ( note: BIRTH_YEAR in "panel" is child birth year )
panel <- merge(x=panel, y=landing[,c("IMDB_ID", "LANDING_YEAR", "FIRST_TAX_YEAR", "LAST_TAX_YEAR", "DEATH_YEAR", "CITIZEN_YEAR","BIRTH_YEAR","gender")], by.x="MAIN_PARENT", by.y="IMDB_ID", all.x=T)
setnames(panel, old=c("BIRTH_YEAR.x","BIRTH_YEAR.y","LANDING_YEAR", "FIRST_TAX_YEAR", "LAST_TAX_YEAR", "DEATH_YEAR", "CITIZEN_YEAR","gender"), new=c("BIRTH_YEAR_child","BirthYear_MainParent","LandingYear_MainParent", "FIRST_TAX_YEAR_MainParent", "LAST_TAX_YEAR_MainParent", "DEATH_YEAR_MainParent", "CITIZEN_YEAR_MainParent","gender_MainParent"))

### Attempt to follow Chetty et al. 2014 where missing tax returns are imputed to zero
# Problem with imputing missing to zero is that a ton of children and parents never file a tax return
# Some children may have left the country (e.g., go to US for school)
# Solution: 
# i) Drop if child do not file AT LEAST ONE tax return between ages 30-34
# ii) Drop if parents do not file AT LEAST ONE tax returns within 10 years of landing
# This effectively means that any child or parent who is alive and filed at least one tax return between ages 30-34 OR within 10Y of landing, 
# will get missing tax returns imputed as zero. IF instead they do not fill a tax return following the above criteria, then they are dropped
panel[, `:=` (Child_AnyTax30_34=max(!is.na(Child_TIRC_HH[which((Year-BIRTH_YEAR_child) %in% c(30,31,32,33,34))])),
              Parent_AnyTaxPost10=max(!is.na(MainParent_TIRC_HH[which((Year-LandingYear_MainParent) %in% c(0:10))]))), by="IMDB_ID"]
panel <- panel[Child_AnyTax30_34==1 & Parent_AnyTaxPost10==1]

# Replace missing tax returns (provided between landing and last year) with zero
panel[, `:=` (MinDeathLeave_MainParent=pmin(DEATH_YEAR_MainParent, LAST_TAX_YEAR_MainParent, na.rm=T)), by="IMDB_ID"] # get the min between death year and last tax year




setnafill(panel, cols=c("MainParent_TIRC_HH"), fill=0)
panel[Year<FIRST_TAX_YEAR_MainParent | Year>MinDeathLeave_MainParent, c("MainParent_TIRC_HH")] <- NA

# Children: 
panel <- merge(x=panel, y=landing[,c("IMDB_ID", "DEATH_YEAR")], by.x="IMDB_ID", by.y="IMDB_ID", all.x=T)
setnames(panel, old=c("DEATH_YEAR"), new="DEATH_YEAR_Child")


setnafill(panel, cols=c("Child_TIRC_HH"), fill=0)
panel[Year>DEATH_YEAR_Child, c( "Child_TIRC_IND", "Child_TIRC_HH")] <- NA


# Spouse income
panel$Child_TIRC_spouse=panel$Child_TIRC_HH - panel$Child_TIRC_IND


# Aggregate to child-level dataset with relevant child and parent incomes
sample <- panel[, .(
  # Child average total household income at 30-34
  Child_Income_HH_30_34=mean(Child_TIRC_HH[which((Year-BIRTH_YEAR_child) %in% c(30,31,32,33,34))], na.rm=T),
  
  # Child average total individual income at 30-34
  Child_Income_IND_30_34=mean(Child_TIRC_IND[which((Year-BIRTH_YEAR_child) %in% c(30,31,32,33,34))], na.rm=T),
  
  
  # Main parent age 45-49 (following Connolly et al., though note that they do mothers only)
  MainParent_Income_HH_MainParentAge45_49=mean(MainParent_TIRC_HH[which((Year-BirthYear_MainParent) %in% c(45,46,47,48,49))], na.rm=T),
  
  
  # Main parent age 45-49 IND (following Connolly et al., though note that they do mothers only)
  MainParent_Income_IND_MainParentAge45_49=mean(MainParent_TIRC_IND[which((Year-BirthYear_MainParent) %in% c(45,46,47,48,49))], na.rm=T)
  
), by="IMDB_ID"]




### Get income percentile
child_landing <- read_dta("H:/Zheng_10223/Joint/child_pnrf_1980_2020_f3_v1.dta")
child_landing <- data.table(child_landing)
child_landing$MAIN_PARENT=child_landing$main_parent; child_landing$main_parent=NULL
child_landing$PARENT1=child_landing$Parent1; child_landing$Parent1=NULL
child_landing$PARENT2=child_landing$Parent2; child_landing$Parent2=NULL
child_landing$CHILD_LAST_TAX_YEAR=child_landing$child_last_tax_year


# Get child and main parent landing year and landing age
sample <- merge(x=sample, y=child_landing[,c("IMDB_ID", "BIRTH_YEAR", "LANDING_YEAR", "LANDING_AGE", "PARENT1", "PARENT2", "MAIN_PARENT")], by.x="IMDB_ID", by.y="IMDB_ID")
sample <- merge(x=sample, y=landing[,c("IMDB_ID", "LANDING_YEAR")], by.x="MAIN_PARENT", by.y="IMDB_ID", all.x=T) #Main parent's landing year
setnames(sample, old=c("LANDING_YEAR.x", "LANDING_YEAR.y", "BIRTH_YEAR"), new=c("LandingYear_Child", "LandingYear_MainParent", "BirthYear_Child"))

# get main parent last tax year
sample=merge(x=sample, y=landing[,c("IMDB_ID","LAST_TAX_YEAR")], by.x="MAIN_PARENT", by.y="IMDB_ID", all.x=T) 
setnames(sample, old=c("LAST_TAX_YEAR"), new=c("LAST_TAX_YEAR_MAINPARENT"))

# get child last tax year 
sample=merge(x=sample, y=child_landing[,c("IMDB_ID","CHILD_LAST_TAX_YEAR")], by.x="IMDB_ID", by.y="IMDB_ID", all.x=T) 



# Get birth year for main parent, parent 1, and parent 2 (latter 2 needed for top parent income)
sample <- merge(x=sample, y=landing[,c("IMDB_ID", "BIRTH_YEAR")], by.x="MAIN_PARENT", by.y="IMDB_ID", all.x=T)  
setnames(sample, old=c("BIRTH_YEAR"), new=c("BirthYear_MainParent"))


# Get gender for main parent, parent 1, and parent 2 (latter 2 needed for father income)
sample <- merge(x=sample, y=landing[,c("IMDB_ID", "gender")], by.x="MAIN_PARENT", by.y="IMDB_ID", all.x=T)  
setnames(sample, old=c("gender"), new=c("gender_MainParent"))

# Get relevant year and age needed for percentile at a year/age
sample$YearAt30 <- sample$BirthYear_Child+30; sample$YearAt10 <- sample$BirthYear_Child+10; sample$YearAt15 <- sample$BirthYear_Child+15; 
sample$MainParentYearat45 <- sample$BirthYear_MainParent+45


# Placeholders for income percentiles
sample$Child_Income_IND_30_34_pct <- NA 



sample$MainParent_Income_HH_MainParentAge45_49_pctparent<- NA

# refugee status: based on the status of the Main Parent

sample <- merge(x=sample, y=landing[,c("IMDB_ID", "IMMIGRATION_CATEGORY_CENSUS")], by.x="MAIN_PARENT", by.y="IMDB_ID", all.x=T)
sample$Immigrantcategory_main=sample$IMMIGRATION_CATEGORY_CENSUS; sample$IMMIGRATION_CATEGORY_CENSUS=NULL

# classify
sample$ImmigrationCategory=NA
sample$ImmigrationCategory[which(sample$Immigrantcategory_main %in% c('C3110', 'C3120', 'C3210', 'C3220', 'C3230', 'D4110', 'D4120', 'D4210', 'Z9991'))]<- "Refugee"
sample$ImmigrationCategory[which(sample$Immigrantcategory_main %in% c('A1211', 'A1212', 'A1215', 'A1221', 'A1225'))] <- "Business"
sample$ImmigrationCategory[which(sample$Immigrantcategory_main %in%  c('A1111', 'A1115', 'A1120', 'A1130', 'A1141', 'A1142', 'A1150', 'A1160', 'A1231', 'A1235', 'A1300'))] <- "Economic"

sample$ImmigrationCategory[which(is.na(sample$ImmigrationCategory))] <- "Missing"
sample$Status_Refugee <- (sample$ImmigrationCategory=="Refugee")
sample$Status_Unknown <- (sample$ImmigrationCategory=="Missing")


fwrite(sample, "H:/Zheng_10223/Joint/samplefile_bootstrap.csv")



############## RUN FROM HERE ######################## 


sample=fread("H:/Zheng_10223/Joint/samplefile_bootstrap.csv")


n_bootstrap=1000


# Read in all the Lad_bootstrap

# Income percentiles from the LAD (at year-age-level)
income_pct <- read.csv("H:/Zheng_10223/Joint/LAD/bootstrap/all_lad_bootstrap.csv", header=TRUE, stringsAsFactors = FALSE)
income_pct <- merge(x=income_pct, y=cpi, by.x="year", by.y="year", all.x=T)
income_pct$pctind_age30_TIRC_real <- income_pct$pctindage30*cpi$cpi[which(cpi$year==2020)]/income_pct$cpi
income_pct$pctind_age45_TIRC_real <- income_pct$pctindage45*cpi$cpi[which(cpi$year==2020)]/income_pct$cpi




income_pct$pctparent_age45_TIRC_real <- income_pct$pctparentage45*cpi$cpi[which(cpi$year==2020)]/income_pct$cpi
income_pct <- data.table(income_pct)
income_pct <- income_pct[order(bsamplenum,year, pct)]



resampled_data <- lapply(1:n_bootstrap, function(k) {
  set.seed(k) # set the seed
  
  
 resample <- sample[sample(nrow(sample), size = nrow(sample), replace = TRUE), ]
# Get child income percentiles (using Canadian 30year old)

 
# LAD INCOME_PCT ONLY STARTS IN 1982 
for(i in 1982:2019){
  
  # Compare income with rank/percentile of average household income AMONG PARENTS
  # calculate percentiles 
  resample$Child_Income_IND_30_34_pct[which(resample$YearAt30==i)] <- findInterval(resample$Child_Income_IND_30_34[which(resample$YearAt30==i)], vec=income_pct$pctind_age30_TIRC_real[which(income_pct$year==i & income_pct$bsamplenum==k)])
  resample$Child_Income_IND_30_34_pct[which(resample$YearAt30==i & resample$Child_Income_IND_30_34_pct==min(resample$Child_Income_IND_30_34_pct[which(resample$YearAt30==i)], na.rm=T))] <-0.5*(1+resample$Child_Income_IND_30_34_pct[which(resample$YearAt30==i & resample$Child_Income_IND_30_34_pct==min(resample$Child_Income_IND_30_34_pct[which(resample$YearAt30==i)], na.rm=T))])
  
  
  resample$MainParent_Income_HH_MainParentAge45_49_pctparent[which(resample$MainParentYearat45==i)] <- findInterval(resample$MainParent_Income_HH_MainParentAge45_49[which(resample$MainParentYearat45==i)], vec=income_pct$pctparent_age45_TIRC_real[which(income_pct$year==i & income_pct$bsamplenum==k)])
 resample$MainParent_Income_HH_MainParentAge45_49_pctparent[which(resample$MainParentYearat45==i  & resample$MainParent_Income_HH_MainParentAge45_49_pctparent==min(resample$MainParent_Income_HH_MainParentAge45_49_pctparent[which(resample$MainParentYearat45==i )], na.rm=T))] <-0.5*(1+resample$MainParent_Income_HH_MainParentAge45_49_pctparent[which(resample$MainParentYearat45==i & resample$MainParent_Income_HH_MainParentAge45_49_pctparent==min(resample$MainParent_Income_HH_MainParentAge45_49_pctparent[which(resample$MainParentYearat45==i )], na.rm=T))])
  
  
  print(i)
}

 # run the three regressions of interest
 model_all= lm(Child_Income_IND_30_34_pct ~MainParent_Income_HH_MainParentAge45_49_pctparent, data=resample )
 model_refugee=lm(Child_Income_IND_30_34_pct ~MainParent_Income_HH_MainParentAge45_49_pctparent, data=resample[resample$Status_Refugee==TRUE,] )
 model_nonrefugee=lm(Child_Income_IND_30_34_pct ~MainParent_Income_HH_MainParentAge45_49_pctparent, data=resample[resample$Status_Refugee==FALSE,] )
 
 # Store the coefficients 
 all_coef <- as.data.frame( coef(summary(model_all)) )
 all_refugee <- as.data.frame( coef(summary(model_refugee)) )
 all_nonrefugee  <- all_coef <- as.data.frame( coef(summary(model_nonrefugee)) )
 
 all_coef$bootiter=k; all_coef$EstimateType[1]="Intercept"; all_coef$EstimateType[2]="Slope"
 all_refugee$bootiter=k; all_refugee$EstimateType[1]="Intercept"; all_refugee$EstimateType[2]="Slope"
 all_nonrefugee$bootiter=k; all_nonrefugee$EstimateType[1]="Intercept"; all_nonrefugee$EstimateType[2]="Slope"
 
 all_coef$model="all"
 all_refugee$model="refugee"
 all_nonrefugee$model="nonrefugee"
 
 dfout=rbind(all_coef,all_refugee)
 dfout=rbind(dfout,all_nonrefugee)
 
 return(dfout)
  
})
coefsout=do.call(rbind, resampled_data)

fwrite(coefsout, "H:/Zheng_10223/Joint/bootstrapcoefs1_250.csv")

